# 1. Convert wig to bed
rule convert_wig_to_bed:
    input:
        indir="~/phastCons100way/"
    shell:
        '''        
        for CHR in {{1..22}} X Y M
        do
            wigFix="{input.indir}/chr${{CHR}}.phastCons100way.wigFix.gz"
            outbed="{input.indir}/chr${{CHR}}.phastCons100way.hg38.bed.gz"
            zcat ${{wigFix}} | convert2bed --input=wig - --max-mem=20G | awk -F"\t" '{{print $1"\t"$2"\t"$3"\t"$1"_"$2"_"$3"\t"$5}}' | pigz > ${{outbed}}
        done
        wait
        '''

# 2. Split bed files to chunks
rule split_bed_to_chunks:
    input:
        indir="~/phastCons100way/"
    output:
        outdir="~/phastCons100way_chunks/"
    params: nr_chunk=1000000
    shell:
        '''
        mkdir -p {output.outdir}
        for CHR in {{1..22}} X Y M
        do
        {{
            zcat {input.indir}/chr${{CHR}}.phastCons100way.hg38.bed.gz | split -l {params.nr_chunk} -d -a 3 - {output.outdir}/chr${{CHR}}.phastCons100way.hg38.chunk_
            ls {output.outdir}/chr${{CHR}}.phastCons100way.hg38.chunk_* | grep -v "bed" | xargs -n1 -i{{}} mv {{}} {{}}.bed
            ls {output.outdir}/chr${{CHR}}.phastCons100way.hg38.chunk_* | grep "bed" | xargs -n1 -i{{}} pigz {{}}
        }} &
        done
        wait
        '''


# 3. Run liftOver
rule run_liftover:
    input:
        chunks_dir="./phastCons100way_chunks/",
        chain="hg38ToSusScr11.over.chain.gz",
        task_idx=0 # 0..118
    output:
        outdir="output_liftOver_chunks"
    shell:
        '''
        mkdir {output.outdir}

        filelist=(`ls {input.chunks_dir}/*`)
        nfile=${{#filelist[*]}}
        
        echo "idx: {input.task_idx}"
        for fileidx in $(seq $((idx*24)) 1 $((idx*24+23)))
        do
        {{
            # if [[ $fileidx > $nfile ]] || [[ $fileidx == $nfile ]]; then
                # continue;
            # fi
            bedfile=${{filelist[fileidx]}}
            echo "bedfile: ${{bedfile}}"
            bedprefix=${{bedfile//"{input.chunks_dir}"/}}
            bedprefix=${{bedprefix//".bed.gz"/}}
            liftOver -minMatch=0.8 ${{bedfile}} {input.chain} {output.outdir}/${{bedprefix}}.hg38ToSusScr11.bed {output.outdir}/${bedprefix}.unMapped
            pigz {output.outdir}/${{bedprefix}}.hg38ToSusScr11.bed
            pigz {output.outdir}/${{bedprefix}}.unMapped
            echo "done: ${{bedfile}}"
        }} &
        done
        wait

        '''

# 4. Gene-level phastCons scores
rule get_gene_level_phasecons:
    input:
        dir_liftOver_chunks="output_liftOver_chunks", # from the output of the previous step
        gene_gtf="~/data/Sus_scrofa.Sscrofa11.1.100.simple.gtf.gz"
    output:
        "./output_phastCons_pcg/Pig.chrAll.gene_coverage_phastCons.txt"
    shell:
        '''
        ### 1. merge all bed files generated by liftOver
        mkdir ./output_hg38ToSusScr11/
        filelist=(`ls {input.dir_liftOver_chunks}/*.hg38ToSusScr11.bed.gz`)
        
        cat ${{filelist[*]}} > ./output_hg38ToSusScr11/All.hg38ToSusScr11.bed.gz
        
        
        ### 2. sort the file by chr and pos in pig genome
        for CHR in `seq 1 18`
        do
        {{
            zcat ./output_hg38ToSusScr11/All.hg38ToSusScr11.bed.gz | grep -P "chr${{CHR}}\t" | sed 's/chr//g' | sort -n -k 2 | pigz > ./output_hg38ToSusScr11/Pig.chr${{CHR}}.hg38ToSusScr11.bed.gz
        }} &
        done
        wait
        
        ### 3. make bed file for annot
        mkdir gene_annot
        for CHR in `seq 1 18`
        do
        {{
            zcat {input.gene_gtf} | awk -F"\t" -v "chr=$CHR" '{{if($1==chr && $2=="gene")print $1"\t"$3-1"\t"$4-1"\t"$5}}' > ./gene_annot/pcg_annot.chr${{CHR}}.bed
        }}
        done
        wait
        
        ### 4. calculate phastCons of protein-coding genes
        mkdir output_phastCons_pcg
        for CHR in `seq 1 18`
        do
        {{
            echo ${{CHR}}
            bedtools map -split -sorted -a ./gene_annot/pcg_annot.chr${{CHR}}.bed -b ./output_hg38ToSusScr11/Pig.chr${{CHR}}.hg38ToSusScr11.bed.gz -o mean > ./output_phastCons_pcg/Pig.chr${{CHR}}.phastCons_pcg.txt
            bedtools coverage -a ./gene_annot/pcg_annot.chr${{CHR}}.bed -b ./__Old/output_hg38ToSusScr11/Pig.chr${{CHR}}.hg38ToSusScr11.bed.gz > ./__Old/output_phastCons_pcg/Pig.chr${{CHR}}.gene_coverage.txt
        }}
        done
        wait
        
        cat ./output_phastCons_pcg/Pig.chr*.phastCons_pcg.txt > ./output_phastCons_pcg/Pig.chrAll.phastCons_pcg.txt
        # pigz ./output_phastCons_pcg/Pig.chrAll.phastCons_pcg.txt
        
        cat ./output_phastCons_pcg/Pig.chr*.gene_coverage.txt > ./output_phastCons_pcg/Pig.chrAll.gene_coverage.txt
        # pigz ./output_phastCons_pcg/Pig.chrAll.gene_coverage.txt
        
        paste ./output_phastCons_pcg/Pig.chrAll.gene_coverage.txt ./output_phastCons_pcg/Pig.chrAll.phastCons_pcg.txt | awk -F"\t" '{{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$13}}' | sed '1i #chr\tstart\tend\tgene_id\tmap_num\tmap_length\tlength\tmap_ratio\tPhastCons' > {output}
        '''

# 5. SNP-level phastCons scores
rule get_snp_level_phastcons:
    output:
        "./output_phastCons_snp/Pig.chrAll.phastCons_snp.txt.gz"
    shell:
        '''
        ### 1. create bed file for SNP map
        mkdir snp_annot
        for CHR in `seq 1 18`
        do
        {{
            geno_vcf="~/data/SNPs.chr${{CHR}}.imputed.vcf.gz"
            zcat ${{geno_vcf}} | grep -v "#" |awk -F"\t" '{{print $1"\t"$2-1"\t"$2"\t"$1"_"$2"_"$4"_"$5}}' > ./snp_annot/snp_annot.chr${{CHR}}.bed
        }} &
        done
        wait
        
        
        ### 2. calculate phastCons for SNPs
        mkdir output_phastCons_snp
        for CHR in `seq 1 18`
        do
        {{
            echo ${{CHR}}
            bedtools map -split -sorted -a ./snp_annot/snp_annot.chr${{CHR}}.bed -b ./output_hg38ToSusScr11/Pig.chr${{CHR}}.hg38ToSusScr11.bed.gz -o mean > ./output_phastCons_snp/Pig.chr${{CHR}}.phastCons_snp.txt
        }}
        done
        wait
        
        cat ./output_phastCons_snp/Pig.chr*.phastCons_snp.txt > ./output_phastCons_snp/Pig.chrAll.phastCons_snp.txt
        pigz ./output_phastCons_snp/Pig.chrAll.phastCons_snp.txt
        '''